Data Import and Preprocessing

In [1]:
# import relevant modules
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime, timedelta
import plotly.express as px
from plotly.subplots import make_subplots
from tqdm.notebook import tqdm
import plotly.graph_objects as go
import glob
import sys
sys.path.append('../scripts/')
from analysis import get_correlation, peak_analysis, peak_ranges
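# helpers from ../scripts/analysis.py (roles inferred from their use below):
#   get_correlation(delay, hashtag_df, cluster_df, cluster_gender_df,
#                   cluster_party_df, sim_df) -> TLCC results per combination
#   peak_analysis(test_range, sim_df, peaks_df, cluster_ts_df) -> t-test records
#   peak_ranges(row) -> (peak_start, peak_end) pairs per hashtag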
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
In [2]:
# load only the relevant hashtags
topics_df = pd.read_json('../../data/BTW17_Twitter/lda/hashtag_topics.json')
hashtags = topics_df['hashtag'].tolist()
In [3]:
# load hashtag timeseries
hashtag_df = pd.read_json('../../data/BTW17_Twitter/hashtags/hashtag_counts.json')
hashtag_df.head(3)
Out[3]:
date hashtag count
0 2017-05-29 150jahrekapital 1
1 2017-05-29 a19 1
2 2017-05-29 abschiebung 14
In [4]:
# load politicians' metadata and keep only the relevant columns
persons_df = pd.read_csv('../../data/BTW17_Suggestions/btw_politicians_demographic.csv')
persons_df.drop(columns=['Unnamed: 0', 'Born', 'Bundesland', 'Age'], inplace=True)
persons_df['Name'] = persons_df['Name'].apply(lambda x: x.lower())
persons_df.rename(columns={'Name':'queryterm', 'Party':'party', 'Gender':'gender'}, inplace=True)
persons_df.head(3)
Out[4]:
queryterm party gender
0 wolfgang stefinger CSU male
1 kai whittaker CDU male
2 katrin albsteiger CSU female
In [5]:
cluster_cat = pd.read_csv('../../data/BTW17_Suggestions/suggestions/cluster_categories.csv', delimiter=',')
cluster_cat.drop(columns='Unnamed: 0', inplace=True)
cluster_cat['size'] = cluster_cat['sugg'].apply(lambda x: x.count(', ')+1)
cluster_cat.head(3)
Out[5]:
cluster category sugg size
0 -1 Rauschen büro lorenz caffier, peter uldall juhl, cloud ... 6217
1 0 Rauschen gebrochen, stadt land fluss, konzert für dich,... 346
2 1 Personen sabine zeidler, birga köhler, rosemarie heinem... 256
In [6]:
# load suggestions timeseries
tmp = pd.read_parquet('../../data/BTW17_Suggestions/processed/suggestions.parquet')
tmp['date'] = pd.to_datetime(tmp['date']).dt.date
suggestions_df = pd.DataFrame()
suggestions_df[['date', 'queryterm', 'suggestion', 'count']] = tmp.groupby(['date', 'queryterm', 'suggestion'], as_index=False).count()
suggestions_df = suggestions_df.merge(persons_df, how='left', on='queryterm')
In [7]:
# load vector similarities
similarity_df = pd.read_json('../../data/BTW17_Suggestions/suggestions/vector_similarity.json')
similarity_df['hashtags'] = [hashtags for i in similarity_df.index]
similarity_df['suggestion'] = similarity_df['suggestion'].apply(lambda x: ' '.join(x))
In [8]:
# join suggestion clusters and group again
suggestions_df = suggestions_df.merge(similarity_df, how='inner', on='suggestion')
suggestions_df = suggestions_df.groupby(['date', 'queryterm', 'party', 'gender', 'cluster'], as_index=False).sum('count')
suggestions_df.head(3)
Out[8]:
date queryterm party gender cluster count
0 2017-05-29 achim post SPD male 2 4
1 2017-05-29 achim post SPD male 5 12
2 2017-05-29 achim post SPD male 75 4
In [9]:
# remodel similarity clusters to hashtag level
similarity_df = similarity_df.set_index(['suggestion', 'cluster']).apply(pd.Series.explode).reset_index()
similarity_df['similarity_scores'] = pd.to_numeric(similarity_df['similarity_scores']) 
similarity_df = similarity_df.groupby(['cluster', 'hashtags'], as_index=False).mean('similarity_scores')
similarity_df = similarity_df.merge(cluster_cat, how='left', on='cluster')

# filter out the 'Rauschen' (noise) category
similarity_df = similarity_df[similarity_df['category']!='Rauschen'].reset_index(drop=True)
similarity_df.head(3)
Out[9]:
cluster hashtags similarity_scores category sugg size
0 1 afdwählen 0.008258 Personen sabine zeidler, birga köhler, rosemarie heinem... 256
1 1 afghanistan -0.011473 Personen sabine zeidler, birga köhler, rosemarie heinem... 256
2 1 altersarmut -0.008137 Personen sabine zeidler, birga köhler, rosemarie heinem... 256
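
For reference, a minimal toy illustration of the explode step above (hypothetical values): list-valued columns are expanded to one row per list element while the index keys stay aligned.

demo = pd.DataFrame({
    'suggestion': ['wahl termin'],
    'cluster': [3],
    'hashtags': [['btw17', 'wahl']],
    'similarity_scores': [[0.7, 0.2]],
})
# one row per (hashtag, score) pair after exploding both list columns
print(demo.set_index(['suggestion', 'cluster']).apply(pd.Series.explode).reset_index())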
In [10]:
# prepare data for TLCC

# drop all combinations with a similarity score below 0.5
sim_df = similarity_df[similarity_df['similarity_scores']>=0.5].reset_index(drop=True)

# aggregate suggestion counts per cluster
cluster_df = suggestions_df.groupby(['date', 'cluster'], as_index=False).sum('count')
cluster_df.rename(columns={'count':'cluster_count'}, inplace=True)

# group suggestions per cluster and party
cluster_party_df = suggestions_df.groupby(['date', 'party', 'cluster'], as_index=False).sum('count')
cluster_party_df.rename(columns={'count':'cluster_count'}, inplace=True)

# group suggestions per cluster and gender
cluster_gender_df = suggestions_df.groupby(['date', 'gender', 'cluster'], as_index=False).sum('count')
cluster_gender_df.rename(columns={'count':'cluster_count'}, inplace=True)

hashtag_df.rename(columns={'count':'hashtag_count'}, inplace=True)
In [11]:
# duplicate the qualitative palette without mutating the shared plotly list
colors = list(px.colors.qualitative.Antique) * 2

Time-Lagged Cross-Correlation (TLCC)
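
The cells below delegate the actual computation to analysis.get_correlation. As an illustrative sketch of the TLCC idea (toy data, not the project's implementation): shift the hashtag series by a lag and correlate it with the suggestion series using Pearson's r.

# TLCC sketch on synthetic data; the real logic lives in ../scripts/analysis.py
rng = np.random.default_rng(42)
hashtag_ts = pd.Series(rng.poisson(20, 120).astype(float))               # daily hashtag counts
suggestion_ts = hashtag_ts.shift(14).fillna(20) + rng.normal(0, 2, 120)  # trails by ~2 weeks

for lag in range(0, 71, 7):                  # same grid as `delays` below
    shifted = hashtag_ts.shift(lag)          # hashtags lead by `lag` days
    mask = shifted.notna()
    r, p = stats.pearsonr(shifted[mask], suggestion_ts[mask])
    print(f'lag={lag:2d}d  r={r:+.3f}  p={p:.3g}')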

In [12]:
# time lags in days: 0, 7, ..., 70
delays = list(range(0, 71, 7))
In [13]:
dfs = []
for i in delays:
    dfs.append(get_correlation(i, hashtag_df, cluster_df, cluster_gender_df, cluster_party_df, sim_df))
In [14]:
for i in range(len(dfs)):
    dfs[i].to_json(f'../../data/Analysis/df_{delays[i]}_delays.json')
In [15]:
# change the pattern to '*.json' to load every file in the folder
input_loc = '../../data/Analysis/*delays.json'
input_files = glob.glob(input_loc)

dfs = []
for file in input_files:
    data = pd.read_json(file)
    data = data.merge(cluster_cat, how='left', on='cluster')
    #data = data[(data['pearsonr']>=0)&(data['p_value']<=0.05)&(data['gender']=='all')&(data['party']=='all')]
    data = data[(data['pearsonr']>=0)]
    dfs.append(data)

Descriptive Statistics

In [16]:
print(f'Number of possible combinations: {len(similarity_df[similarity_df["category"]!="Rauschen"])}')
print(f'Number of relevant combinations: {len(sim_df)}')
print(f'Combinations per hashtag: {len(sim_df)/sim_df["hashtags"].nunique()}')
print(f'Share of relevant combinations: {round(len(sim_df[sim_df["category"]!="Rauschen"])/len(similarity_df[similarity_df["category"]!="Rauschen"])*100,2)}%')
Number of possible combinations: 115020
Number of relevant combinations: 1050
Combinations per hashtag: 6.481481481481482
Share of relevant combinations: 0.91%

Categories of the Search-Suggestion Clusters

In [17]:
for category in sim_df['category'].unique():
    tmp = sim_df[sim_df['category']==category]
    print(f'Category: {category}, number of relevant combinations: {tmp.groupby(["cluster", "hashtags"], as_index=False).ngroups}')
Category: Personen, number of relevant combinations: 690
Category: Orte, number of relevant combinations: 41
Category: Politik, number of relevant combinations: 108
Category: Medizin, number of relevant combinations: 12
Category: Organisationen, number of relevant combinations: 18
Category: Medien, number of relevant combinations: 25
Category: Wirtschaft, number of relevant combinations: 52
Category: Berufe, number of relevant combinations: 104
In [18]:
sim_df.groupby('category', as_index=False)['similarity_scores'].mean()
Out[18]:
category similarity_scores
0 Berufe 0.720236
1 Medien 0.608040
2 Medizin 0.511738
3 Organisationen 0.572786
4 Orte 0.596317
5 Personen 0.588212
6 Politik 0.568747
7 Wirtschaft 0.578236
In [19]:
# load cluster_df and join categories
cluster_cat_df = pd.read_json('../../data/BTW17_Suggestions/suggestions/cluster.json')
cluster_cat_df = cluster_cat_df.merge(cluster_cat, how='left', on='cluster')

tmp = pd.DataFrame()
tmp['Cluster'] = cluster_cat_df['cluster'].value_counts().index
tmp['Clustergröße'] = cluster_cat_df['cluster'].value_counts().values
tmp = tmp.merge(cluster_cat, how='left', left_on='Cluster', right_on='cluster')
tmp = tmp[tmp['category']!='Rauschen']
tmp2 = cluster_cat_df.groupby('category', as_index=False)['cluster'].nunique().sort_values(by='cluster', ascending=False)
tmp = tmp.merge(tmp2, on='category')
tmp['category'] = tmp.apply(lambda x: x['category'] + f' ({x["cluster_y"]} Cluster)', axis=1)

tmp.rename(columns={'category':'Kategorie'}, inplace=True)
fig = px.box(tmp[tmp['Kategorie']!='Rauschen'], x='Kategorie', y='Clustergröße',# points='all',
             color='Kategorie',
             template='simple_white', color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()
In [20]:
fig = px.scatter(cluster_cat_df, x='t-SNE(x)', y='t-SNE(y)', color='category', hover_name='suggestion',
                 template='simple_white', color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()

Regression Analyses across Different Dimensions (Influences on the Mean Similarity Scores)

In [34]:
# regression analysis
reg_df = suggestions_df.groupby(['party', 'gender', 'cluster'], as_index=False).sum()
tmp = similarity_df.groupby(['cluster', 'category'], as_index=False).mean()
reg_df = reg_df.merge(tmp, how='left', on='cluster')
reg_df.dropna(inplace=True)
reg_df = reg_df.reset_index(drop=True)
reg_df.head(3)
Out[34]:
party gender cluster count category similarity_scores size
0 AFD female 2 534 Wirtschaft 0.038410 20.0
1 AFD female 9 428 Orte 0.013585 225.0
2 AFD female 13 25 Wirtschaft 0.015126 306.0

Cluster Size

No discernible effect on the similarity score

In [35]:
reg = smf.ols('similarity_scores ~ size', data=reg_df).fit()
A = np.identity(len(reg.params))
print(reg.f_test(A))
print(reg.rsquared)
reg.summary().tables[1]
<F test: F=array([[11.2580379]]), p=1.352780692071737e-05, df_denom=2.67e+03, df_num=2>
1.1975418258813875e-05
Out[35]:
coef std err t P>|t| [0.025 0.975]
Intercept 0.0027 0.001 3.637 0.000 0.001 0.004
size 2.236e-06 1.25e-05 0.179 0.858 -2.23e-05 2.68e-05
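
A note on the F-tests in this section: f_test(np.identity(len(reg.params))) also constrains the intercept to zero, which is why the printed p-values are tiny even when R² is essentially zero. The conventional overall F-test (slopes only) is available directly:

print(reg.fvalue, reg.f_pvalue)                      # tests only the slope(s)
print(reg.f_test(np.identity(len(reg.params))[1:]))  # same test via a contrast matrix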

Party Affiliation of the Person in the Query Term

No discernible effect on the similarity score

In [36]:
reg = smf.ols('similarity_scores ~ C(party)', data=reg_df).fit()
A = np.identity(len(reg.params))
print(reg.f_test(A))
print(reg.rsquared)
reg.summary().tables[1]
<F test: F=array([[2.4613411]]), p=0.006295783343604179, df_denom=2.66e+03, df_num=10>
0.0008189774284228513
Out[36]:
coef std err t P>|t| [0.025 0.975]
Intercept 0.0043 0.003 1.486 0.137 -0.001 0.010
C(party)[T.CDU] -0.0021 0.003 -0.667 0.505 -0.008 0.004
C(party)[T.CSU] -0.0019 0.003 -0.565 0.572 -0.009 0.005
C(party)[T.DIE LINKE] -0.0025 0.003 -0.749 0.454 -0.009 0.004
C(party)[T.FDP] 0.0008 0.004 0.193 0.847 -0.007 0.008
C(party)[T.GRÜNE] -0.0010 0.003 -0.291 0.771 -0.007 0.006
C(party)[T.Parteilos] -0.0012 0.005 -0.241 0.809 -0.011 0.009
C(party)[T.SPD] -0.0013 0.003 -0.410 0.682 -0.007 0.005
C(party)[T.SSW] 0.0034 0.012 0.283 0.777 -0.020 0.027
C(party)[T.fraktionslos] -0.0041 0.007 -0.585 0.559 -0.018 0.010

Gender of the Person in the Query Term

No discernible effect on the similarity score

In [37]:
reg = smf.ols('similarity_scores ~ C(gender)', data=reg_df).fit()
A = np.identity(len(reg.params))
print(reg.f_test(A))
print(reg.rsquared)
reg.summary().tables[1]
<F test: F=array([[11.54860668]]), p=1.0141573651000649e-05, df_denom=2.67e+03, df_num=2>
0.00022808162873377835
Out[37]:
coef std err t P>|t| [0.025 0.975]
Intercept 0.0022 0.001 2.369 0.018 0.000 0.004
C(gender)[T.male] 0.0009 0.001 0.780 0.436 -0.001 0.003

Cluster Category

Effects still to be determined (note the significant negative coefficients for Medien, Medizin, and Orte below)

In [38]:
reg = smf.ols('similarity_scores ~ C(category)', data=reg_df).fit()
A = np.identity(len(reg.params))
print(reg.f_test(A))
print(reg.rsquared)
reg.summary().tables[1]
<F test: F=array([[14.97259027]]), p=1.2972628930425782e-28, df_denom=2.66e+03, df_num=11>
0.05042725711080598
Out[38]:
coef std err t P>|t| [0.025 0.975]
Intercept 0.0086 0.003 2.575 0.010 0.002 0.015
C(category)[T.Justiz] -0.0071 0.006 -1.253 0.210 -0.018 0.004
C(category)[T.Kultur] -0.0052 0.009 -0.569 0.570 -0.023 0.013
C(category)[T.Medien] -0.0304 0.004 -7.081 0.000 -0.039 -0.022
C(category)[T.Medizin] -0.0204 0.006 -3.540 0.000 -0.032 -0.009
C(category)[T.Organisationen] -0.0046 0.004 -1.060 0.289 -0.013 0.004
C(category)[T.Orte] -0.0100 0.004 -2.798 0.005 -0.017 -0.003
C(category)[T.Personen] -0.0043 0.003 -1.232 0.218 -0.011 0.003
C(category)[T.Politik] -0.0032 0.004 -0.875 0.382 -0.010 0.004
C(category)[T.Privatleben] 0.0057 0.005 1.264 0.206 -0.003 0.015
C(category)[T.Wirtschaft] 0.0032 0.004 0.775 0.438 -0.005 0.011

Cluster Frequency

No discernible effect

In [39]:
reg = smf.ols('similarity_scores ~ count', data=reg_df).fit()
A = np.identity(len(reg.params))
print(reg.f_test(A))
print(reg.rsquared)
reg.summary().tables[1]
<F test: F=array([[12.19366108]]), p=5.350995615545452e-06, df_denom=2.67e+03, df_num=2>
0.0007074975627996416
Out[39]:
coef std err t P>|t| [0.025 0.975]
Intercept 0.0030 0.001 4.936 0.000 0.002 0.004
count -5.102e-08 3.71e-08 -1.374 0.170 -1.24e-07 2.18e-08

TLCC Results

Research question: how long does the penetration take on average and across the individual dimensions? Measurement: TLCC with Pearson's r and p-value.

Analysis across All Combinations

In [40]:
# overview of the correlations and their p-values per time lag
delay_list = []
r_list = []
p_list = []

for i in range(len(dfs)):
    delay_list.append(int(delays[i]/7))
    df = dfs[i][(dfs[i]['gender']=='all')&(dfs[i]['party']=='all')]
    r_list.append(round(df['pearsonr'].mean(),3))
    p_values = df['p_value'].to_numpy()
    p_list.append(round(stats.combine_pvalues(p_values)[1],3))
    
tmp = pd.DataFrame(data={'Time Lag (in Wochen)': delay_list, 'Pearson R': r_list, 'P-Wert': p_list})
tmp
Out[40]:
Time Lag (in Wochen) Pearson R P-Wert
0 0 0.120 0.0
1 1 0.122 0.0
2 2 0.119 0.0
3 3 0.131 0.0
4 4 0.141 0.0
5 5 0.131 0.0
6 6 0.161 0.0
7 7 0.170 0.0
8 8 0.163 0.0
9 9 0.171 0.0
10 10 0.116 0.0
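
The P-Wert column comes from scipy.stats.combine_pvalues, which defaults to Fisher's method: under H0, -2 * sum(ln p_i) follows a chi-squared distribution with 2k degrees of freedom. A quick sanity check on toy p-values:

ps = np.array([0.04, 0.20, 0.01, 0.30])
stat, p_comb = stats.combine_pvalues(ps)                      # method='fisher' is the default
manual = stats.chi2.sf(-2 * np.log(ps).sum(), df=2 * len(ps))
print(round(p_comb, 6), round(manual, 6))                     # identical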

All correlations are significant (p < 0.05), so we examine them in a plot.

In [41]:
fig = px.line(tmp, x='Time Lag (in Wochen)', y='Pearson R',
              template='simple_white', color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()

Plateau between 6 and 9 weeks, albeit with very low correlation overall. At 9 weeks there are only a few exceptions:

In [42]:
tmp = dfs[9][(dfs[9]['gender']=='all')&(dfs[9]['party']=='all')]
tmp = tmp[tmp['pearsonr']>=0.5]
tmp.sort_values(by='pearsonr', ascending=False)[['cluster', 'hashtags', 'category_x', 'pearsonr', 'similarity_scores']]
Out[42]:
cluster hashtags category_x pearsonr similarity_scores
9814 711 bureg Personen 0.735616 0.675571
2482 387 btw2017 Personen 0.666920 0.660750
4679 505 btw2017 Personen 0.651856 0.512437
2508 387 bundestagswahl Personen 0.627181 0.692417
4705 505 bundestagswahl Personen 0.622913 0.554438
7513 620 btw Personen 0.589454 0.544000
7656 620 linke Personen 0.542172 0.563857
2950 387 traudichdeutschland Personen 0.541448 0.603667
2534 387 darumgrün Personen 0.539766 0.552000
4380 490 populismus Organisationen 0.533656 0.564286
7422 594 steineke Personen 0.513590 0.539500
2469 387 btw17 Personen 0.505943 0.603667
8397 671 islamisierung Orte 0.503226 0.658875

Analysis by Cluster Category

In [43]:
delay_list = []
categories = []
r_list = []
p_list = []

for i in range(len(dfs)):
    for category in set(similarity_df['category']):
        delay_list.append(delays[i])
        df = dfs[i][(dfs[i]['gender']=='all')&(dfs[i]['party']=='all')]
        categories.append(category)
        r_list.append(df[df['category_x']==category]['pearsonr'].mean())
        p_values = df[df['category_x']==category]['p_value'].to_numpy()
        p_list.append(stats.combine_pvalues(p_values)[1])

tmp = pd.DataFrame(data={'Delay': delay_list, 'Kategorie': categories, 'Pearson R': r_list, 'P-Wert': p_list})
tmp = tmp.dropna()
tmp = tmp[tmp['Kategorie']!='Rauschen']

fig = make_subplots(rows=1, cols=2, subplot_titles=('Pearson R', 'P-Werte'),
                    shared_yaxes=True, horizontal_spacing=0.15)

fig.add_trace(go.Heatmap(z=tmp['Pearson R'], x=tmp['Kategorie'],
                         y=tmp['Delay'],
                         colorscale=px.colors.sequential.RdBu, colorbar_x=0.45), row=1, col=1)

fig.add_trace(go.Heatmap(z=tmp['P-Wert'], x=tmp['Kategorie'],
                         y=tmp['Delay'],
                         colorscale=px.colors.sequential.RdBu_r), row=1, col=2)

fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()

Highest significant correlations in the 6-9 week range, in particular for Personen and Wirtschaft. This confirms the results of the overall analysis.

Analysis by Gender of the Person in the Query Term

In [44]:
delay_list = []
gender_list = []
r_list = []
p_list = []

for i in range(len(dfs)):
    for gender in set(suggestions_df['gender']):
        delay_list.append(delays[i])
        df = dfs[i][(dfs[i]['gender']!='all')&(dfs[i]['party']=='all')]
        gender_list.append(gender)
        df = df[df['category_x']!='Rauschen']
        r_list.append(df[df['gender']==gender]['pearsonr'].mean())
        p_values = df[df['gender']==gender]['p_value'].to_numpy()
        p_list.append(stats.combine_pvalues(p_values)[1])

tmp = pd.DataFrame(data={'Delay': delay_list, 'Gender': gender_list, 'Pearson R': r_list, 'P-Wert': p_list})
tmp = tmp.dropna()

fig = make_subplots(rows=1, cols=2, subplot_titles=('Pearson R', 'P-Werte'),
                    shared_yaxes=True, horizontal_spacing=0.15)

fig.add_trace(go.Heatmap(z=tmp['Pearson R'], x=tmp['Gender'],
                         y=tmp['Delay'],
                         colorscale=px.colors.sequential.RdBu, colorbar_x=0.45), row=1, col=1)

fig.add_trace(go.Heatmap(z=tmp['P-Wert'], x=tmp['Gender'],
                         y=tmp['Delay'],
                         colorscale=px.colors.sequential.RdBu_r), row=1, col=2)

fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()

Confirms the results of the overall analysis.

Analysis by Party of the Person in the Query Term

In [45]:
delay_list = []
party_list = []
r_list = []
p_list = []

for i in range(len(dfs)):
    for party in set(suggestions_df['party']):
        delay_list.append(delays[i])
        df = dfs[i][(dfs[i]['gender']=='all')&(dfs[i]['party']!='all')]
        party_list.append(party)
        df = df[df['category_x']!='Rauschen']
        r_list.append(df[df['party']==party]['pearsonr'].mean())
        p_values = df[df['party']==party]['p_value'].to_numpy()
        p_list.append(stats.combine_pvalues(p_values)[1])

tmp = pd.DataFrame(data={'Delay': delay_list, 'Parteien': party_list, 'Pearson R': r_list, 'P-Wert': p_list})
tmp = tmp.dropna()

fig = make_subplots(rows=1, cols=2, subplot_titles=('Pearson R', 'P-Werte'),
                    shared_yaxes=True, horizontal_spacing=0.15)

fig.add_trace(go.Heatmap(z=tmp['Pearson R'], x=tmp['Parteien'],
                         y=tmp['Delay'],
                         colorscale=px.colors.sequential.RdBu, colorbar_x=0.45), row=1, col=1)

fig.add_trace(go.Heatmap(z=tmp['P-Wert'], x=tmp['Parteien'],
                         y=tmp['Delay'],
                         colorscale=px.colors.sequential.RdBu_r), row=1, col=2)

fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()

Broadly confirms the results of the overall analysis. Exception: the AfD is consistently strongly correlated; possibly a party whose visibility depends strongly on Twitter?

t-Tests of the Daily Means around the Peaks
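
The tests themselves are performed by analysis.peak_analysis. As a hedged sketch of the underlying idea (not the project's implementation): compare the counts inside a window around each peak against the counts outside it with a two-sample t-test.

def peak_ttest_sketch(counts, in_peak):
    """counts: 1-D array of daily counts; in_peak: boolean mask of peak days."""
    return stats.ttest_ind(counts[in_peak], counts[~in_peak], equal_var=False)  # Welch

toy = np.random.default_rng(0).poisson(10, 100).astype(float)
mask = np.zeros(100, dtype=bool)
mask[40:47] = True                           # a hypothetical one-week peak window
print(peak_ttest_sketch(toy, mask))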

In [46]:
peaks_df = pd.read_json('../../data/BTW17_Twitter/peaks/peak_dates.json')
peaks_df['num_peaks'] = peaks_df.apply(lambda x: len(x['lda_dates']) / 7, axis=1)
peaks_df[['peak_start', 'peak_end']] = peaks_df.apply(peak_ranges, axis=1)
peaks_df.drop(columns=['index', 'num_peaks', 'lda_dates'], inplace=True)
peaks_df = peaks_df.set_index(['hashtag']).apply(pd.Series.explode).reset_index()
peaks_df.head(3)
Out[46]:
hashtag peak_start peak_end
0 afghanistan 2017-05-29 2017-06-04
1 afghanistan 2017-08-22 2017-08-28
2 armut 2017-07-03 2017-07-09
In [47]:
cluster_ts_df = suggestions_df.groupby(['date', 'cluster'], as_index=False).sum('count')
cluster_ts_df.head(3)
Out[47]:
date cluster count
0 2017-05-29 0 5774
1 2017-05-29 1 646
2 2017-05-29 2 1449
In [48]:
analysis_dfs = []

for i in tqdm(range(len(delays[1:]))):
    test_range = delays[i+1]
    tmp = pd.DataFrame(data=peak_analysis(test_range, sim_df, peaks_df, cluster_ts_df))
    print(tmp[(tmp['hashtag']=='all')&(tmp['category']=='all')])
    analysis_dfs.append(tmp)
  hashtag category  test_range      t      p
0     all      all           7 -0.115  0.908
  hashtag category  test_range      t      p
0     all      all          14 -0.086  0.931
  hashtag category  test_range      t      p
0     all      all          21 -0.105  0.916
  hashtag category  test_range      t      p
0     all      all          28  0.596  0.551
  hashtag category  test_range      t      p
0     all      all          35  0.572  0.567
  hashtag category  test_range      t      p
0     all      all          42  0.925  0.355
  hashtag category  test_range      t      p
0     all      all          49  0.816  0.415
  hashtag category  test_range      t     p
0     all      all          56  0.773  0.44
  hashtag category  test_range      t     p
0     all      all          63  0.739  0.46
  hashtag category  test_range      t      p
0     all      all          70  0.709  0.478

No significant results across all categories combined. This was to be expected, as topics from different categories are likely to behave differently.

In [49]:
# save files (analysis_dfs was built from delays[1:], hence the i+1 in the name)
for i in range(len(analysis_dfs)):
    analysis_dfs[i].to_json(f'../../data/Analysis/peak_df_{delays[i+1]}_range.json')
In [50]:
# change the pattern to '*.json' to load every file in the folder
input_loc = '../../data/Analysis/*range.json'
input_files = glob.glob(input_loc)

analysis_dfs = []
for file in input_files:
    data = pd.read_json(file)
    analysis_dfs.append(data)
In [51]:
# prepare plots
categories = cluster_cat['category'].unique().tolist()

plot_df = {'category':[], 'test_range':[], 't':[], 'p':[]}

for category in categories:
    for i in range(len(analysis_dfs)):
        tmp = analysis_dfs[i][analysis_dfs[i]['category']==category]
        if len(tmp) != 1:
            continue  # category missing for this test range
        plot_df['test_range'].append(int(tmp['test_range'].iloc[0]))
        plot_df['t'].append(float(tmp['t'].iloc[0]))
        plot_df['p'].append(float(tmp['p'].iloc[0]))
        plot_df['category'].append(category)
        
plot_df = pd.DataFrame(data=plot_df)
In [53]:
# filter the plot to significant values (p <= 0.05)
plot_df = plot_df[plot_df['p']<=0.05]
plot_df['test_range'] = plot_df['test_range'] / 7
fig = make_subplots(rows=2, cols=4, shared_yaxes='all', shared_xaxes='all', 
                    subplot_titles=['Berufe', 'Medizin', 'Organisationen',
                                    'Orte', 'Personen', 'Politik', 'Wirtschaft'])

fig.add_trace(go.Bar(x=plot_df[plot_df['category']=='Berufe']['test_range'],
                     y=plot_df[plot_df['category']=='Berufe']['t'],
                     name='Berufe',
                     marker_color=px.colors.qualitative.Antique[0]),
              row=1, col=1)

fig.add_trace(go.Bar(x=plot_df[plot_df['category']=='Medizin']['test_range'],
                     y=plot_df[plot_df['category']=='Medizin']['t'],
                     name='Medizin',
                     marker_color=px.colors.qualitative.Antique[3]),
              row=1, col=2)


fig.add_trace(go.Bar(x=plot_df[plot_df['category']=='Organisationen']['test_range'],
                     y=plot_df[plot_df['category']=='Organisationen']['t'],
                     name='Organisationen',
                     marker_color=px.colors.qualitative.Antique[4]),
              row=1, col=3)


fig.add_trace(go.Bar(x=plot_df[plot_df['category']=='Orte']['test_range'],
                     y=plot_df[plot_df['category']=='Orte']['t'],
                     name='Orte',
                     marker_color=px.colors.qualitative.Antique[5]),
              row=1, col=4)


fig.add_trace(go.Bar(x=plot_df[plot_df['category']=='Personen']['test_range'],
                     y=plot_df[plot_df['category']=='Personen']['t'],
                     name='Personen',
                     marker_color=px.colors.qualitative.Antique[6]),
              row=2, col=1)


fig.add_trace(go.Bar(x=plot_df[plot_df['category']=='Politik']['test_range'],
                     y=plot_df[plot_df['category']=='Politik']['t'],
                     name='Politik',
                     marker_color=px.colors.qualitative.Antique[7]),
              row=2, col=2)

fig.add_trace(go.Bar(x=plot_df[plot_df['category']=='Wirtschaft']['test_range'],
                     y=plot_df[plot_df['category']=='Wirtschaft']['t'],
                     name='Wirtschaft',
                     marker_color=px.colors.qualitative.Antique[10]),
              row=2, col=3)


fig.update_yaxes(title='t', col=1)
fig.update_xaxes(title='Range (in Wochen)', row=2)

fig.update_layout(font=dict(family='Computer Modern', color='black', size=15),
                  template='simple_white', showlegend=False)
fig.show()
In [54]:
tmp_dict = {'category': [], 'mean': [], 'std': []}

tmp = cluster_df[cluster_df['cluster'].isin(sim_df['cluster'].unique())]
tmp = tmp.merge(cluster_cat, how='left', on='cluster')
for category in tmp['category'].unique():
    tmp2 = tmp[tmp['category']==category]
    tmp2 = tmp2.groupby('date', as_index=False).mean('cluster_count')
    tmp2['cluster_count'] = (tmp2['cluster_count'] - tmp2['cluster_count'].min())/ (tmp2['cluster_count'].max() - tmp2['cluster_count'].min())
    tmp_dict['category'].append(category)
    tmp_dict['mean'].append(tmp2["cluster_count"].mean())
    tmp_dict['std'].append(tmp2["cluster_count"].std())

merge_df = pd.DataFrame(data=tmp_dict)   

plot_df = plot_df.merge(merge_df, how='left', on='category')

plot_df.groupby('category', as_index=False).mean()[['category', 'mean', 'std']]
Out[54]:
category mean std
0 Berufe 0.705343 0.228664
1 Medizin 0.438447 0.262942
2 Organisationen 0.656587 0.162653
3 Orte 0.769198 0.173594
4 Personen 0.861747 0.162287
5 Politik 0.812361 0.147757
6 Wirtschaft 0.788193 0.177424
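
The per-category series above are min-max scaled to [0, 1] before averaging; as a standalone helper, that step is simply:

def min_max(s):
    """Min-max scale a pandas Series to [0, 1], as done inline in the cell above."""
    return (s - s.min()) / (s.max() - s.min())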
In [55]:
fig = px.imshow(plot_df[['t', 'mean', 'std']].corr(), color_continuous_scale='RdBu')
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()